import os

from pyspark import find_spark_home
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, ArrayType, StringType, StructField, IntegerType, BooleanType, FloatType

# Point Spark at a local Hadoop install (winutils) on Windows, but only
# when the caller has not already configured HADOOP_HOME themselves.
if 'HADOOP_HOME' not in os.environ:
    os.environ['HADOOP_HOME'] = 'D:\\hadoop-2.9.2'

# Driver: build (or reuse) a local, single-threaded SparkSession for this demo.
spark = (
    SparkSession.builder
    .master('local')
    .appName('HelloSpark')
    .getOrCreate()
)

# Data sources Spark can read:
# 1. CSV files
# 2. JSON files
# 3. Databases (via JDBC)
# 4. text: plain text files
# 5. orc/parquet: columnar files stored on Hadoop
# 6. table: read data from Hive
# df = spark.read.format('csv') \
#     .option('header', True) \
#     .load('dataset/BeijingPM20100101_20151231.csv')

# df = spark.read.option('multiline', True).json('dataset\\season.json')
# df.show(truncate=False)

# 3. Connect to the MySQL database over JDBC and load the `employees` table.
# Credentials were hard-coded in source; read them from the environment
# instead, keeping the original values as fallbacks so default behavior
# is unchanged. Requires the MySQL JDBC driver on the Spark classpath.
employee_df = spark.read.format("jdbc") \
    .option("url", os.environ.get("HR_DB_URL", "jdbc:mysql://master:3306/hr")) \
    .option("dbtable", "employees") \
    .option("user", os.environ.get("HR_DB_USER", "root")) \
    .option("password", os.environ.get("HR_DB_PASSWORD", "mysqlroot")) \
    .load()

# rank() is a window function and must be given an OVER clause; calling it
# bare (employee_df.select(rank())) raises AnalysisException at runtime.
# Rank employees by salary, highest first.
# NOTE(review): assumes the employees table has a `salary` column — confirm
# against the hr schema.
salary_window = Window.orderBy(desc("salary"))
employee_df \
    .select("*", rank().over(salary_window).alias("salary_rank")) \
    .show(truncate=False)
employee_df.show(truncate=False)

# Overwrite so the script can be re-run without failing because the
# 'result' directory already exists.
employee_df.write.mode('overwrite').json('result')

# Release the local Spark context now that the script is done.
spark.stop()